#!/usr/bin/python
# -*- coding: cp1251 -*-
r"""
NAME:
    pyDSL2HTM v. 0.21; last update: 07-Jan-2008

AUTHOR:
    zhuman 2012

DESCRIPTION:
    Parse Lingvo DSL source files (in Unicode UCS2-LE or ANSI
    encodings) and:
    1. Detect entry and field boundaries and insert markers:
    '<!--~-->' = as entry delimiter
    '<!--=-->' = as field delimiter (separates headword and definition)

    2. Convert DSL markup to HTML, according to conversion tables
    3. Save resulting file to UTF-8 or other supported encodings.

    There are two output modes:

        -p = Pretty HTML (with HTML HEADER and styles).
             This mode is the default; '-p' may be omitted.

        -r = Raw HTML (for database import)
             Note: In the raw HTML mode the <DT> tag is not prepended to the definition.

    !!!TODO Add [s] tag for links [/s]

    Copyright (C) 2007 xhuman <xh@ua.fm>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by the
    Free Software Foundation; either version 2, or any later version.

    This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    for more details.


"""

import sys
import os
import re
import codecs
import time
#from inspect import currentframe

"""
Configuration settings for pyDSL2HTM
"""
# Markers written in front of every entry and between headword and
# definition (intended for SQL import of the resulting file).
ENTRY_SEPARATOR = '%s<!--~-->' % os.linesep
FIELD_SEPARATOR = '<!--=-->'

# Run-time state; both are assigned by convert_dsl().
END_OF_LINE = callback = None

# Currently open files, remembered so that cleanExit() can close them.
_in_file = _out_file = None

# Extra text appended to the two separators above in pretty HTML mode.
ADDITIONAL_ENTRY_SEPARATOR_PRETTY = '<DT>'
ADDITIONAL_FIELD_SEPARATOR_PRETTY = ''

# Inserted at every line break inside a definition.
NEW_LINE_MARKER = '<dd>'

# Substrings that are deleted from the HEADWORD only (each maps to '').
HEADWORD_REPL = dict.fromkeys(
    (
        u'{\u2027}',   # for Collins 2006
        r"{[']}",      # remove accents until there are some better ideas
        r"{[/']}",
        '\\',          # escape character
    ),
    '',
)


# First pass (plain substring replacement): the '<<' / '>>' reference
# markers are rewritten as [ref] tags before any angle bracket is escaped.
DSL2HTM_DICT_1ST_PASS = {
    '<<': '[ref]',
    '>>': '[/ref]',
}

# Second pass (plain substring replacement): escape the remaining angle
# brackets for HTML and drop the [!trs] / [/!trs] tags.
DSL2HTM_DICT_2ND_PASS = {
    '<': '&lt;',
    '>': '&gt;',
    r'[!trs]': '',
    r'[/!trs]': '',
}


# Third pass: regular-expression replacements, {compiled pattern: template}.
# The patterns are written as raw strings so that every backslash reaches
# the regex engine unchanged.
# NOTE(review): some patterns overlap (e.g. '[trn]' is matched both by the
# generic opening-tag rule and by the tag-removal rule), so the result for
# such tags depends on the order in which the dict is iterated.
DSL2HTM_DICT_3RD_PASS = {
    # Opening tags -> <span class="...">
    re.compile(r'\[(m|c|trn|t|com|p|ex|ref)\]'): r'<span class="\1">',
    re.compile(r'(\[\*\]|\{\{)'): '<span class="star">',   # '{{' and '[*]'
    re.compile(r"\['\]"): '<span class="stress">',
    re.compile(r'(\[ref\s[^\]]+\]|<<)'): '<span class="ref">',

    # Closing tags -> </span>
    re.compile(r'>>|\}\}'): '</span>',                      # '}}' and '>>'
    re.compile(r"\[/(?:c|m|trn|t|com|p|ex|ref|'|\*|m)\]"): '</span>',

    # Colour: [c name] -> <span class="col_name">
    re.compile(r'\[c ([\w]+)\]'): r'<span class="col_\1">',

    # Tags that map onto the HTML tag of the same name
    # (sub, sup, u, i, b; e.g. [sub] -> <sub>)
    re.compile(r'\[(sup|sub|u|i|b)\]'): r'<\1>',
    re.compile(r'\[/(sup|sub|u|i|b)\]'): r'</\1>',

    # Indentation levels: [m1], [m2], ... -> <span class="m1">, ...
    re.compile(r'\[m(\d)\]'): r'<span class="m\1">',

    # Tags that are simply removed
    re.compile(r'\[/?(?:trn|lang)\]'): '',
    re.compile(r'\[lang\s[^\]]*\]'): '',

    # Remove escape characters
    re.compile(r'\\'): '',

    # Remove dictionary specific strings
    re.compile(r'Trimis de.*Sursa:\s*'): '',  # rodex
    # re.compile(u'{\u2027}'):''               # collins 2006
    #'\t':'',
}

# HTML and CSS for pretty HTML output.
# HTML_HEADER contains exactly one '%s' placeholder, which convert_dsl()
# fills with the output encoding (used as the charset of the page).
# Fixes relative to the previous template: the BODY attribute 'TET' is now
# 'TEXT', the invalid selector 'TH, {' is now 'TH {', the duplicated
# .m1-.m4 rules and the value-less 'vertical-align:' declaration are gone.
# NOTE(review): the colour rule in DSL2HTM_DICT_3RD_PASS emits classes named
# "col_<name>", while this stylesheet only defines .col1-.col4 - confirm
# which naming is intended.
HTML_HEADER = r'''<HTML>
<HEAD>
            <TITLE></TITLE>
            <META HTTP-EQUIV=Content-Type CONTENT="text/html; charset=%s" />
            <style type="text/css">
            body {padding: 1px}
            p, body, table {font-family: "Arial Unicode MS", verdana, arial; font-size: 9pt;}
            table { border: 1px solid #cdc1ae; border-collapse: collapse; padding: 0px; margin: 1px;}
            TH {border: 1px solid #cdc1ae;  }
            DT {text-align: left; border: 1px solid white; border-bottom: 1px solid #d5cdbf; border-right: 1px solid #e5dfce; color: #817e70;margin: 2px 80px ; padding: 5px 10px; background: #eff4e5; font-weight: bold}
            DD {text-align: left; margin: 0px 80px ; padding: 0px 110px; background: #f9f9fb; }
            dd p {margin: 0px; padding: 1px}
            TD {border: 1px solid #cdc1ae; padding: 2px;vertical-align: top;}
            TR { padding: 2px;background:#fbf8e9 ; }
            h1 {font-size: 13pt; color: #999999}
            pre {font-family: "Lucida Console", "Lucida Sans Unicode"}
            A:link{Color : #00008B; Font-Family : Arial, Helvetica, sans-serif;Text-Decoration : none; }
            A:visited{Color : #00008B;Font-Family : Arial, Helvetica, sans-serif;Text-Decoration : none;}
            A:hover{Color : #A0522D;Font-Family : Arial, Helvetica, sans-serif;Text-Decoration : none;}
            table.search {border: 0px; background: #e8e7d8 }
            table.search td {text-align: right; border: 0px}
            .info {color: #bbbbbb; padding: 3px}
            .even {background-color: #f5f0df}
            input {border:1px solid #a3a3a3;background: #fffbd5; margin: 5px}
            select {background-color: #f9f2f2; color: Olive; border:1px solid #d6d1c0;}
            .button {border:1px solid #d6d1c0; background: #e8e7d8 }
            .dbg {color:#eaebec}
            .prev{display: inline; float: left; text-align:left; width: 100px; }
            .next{display: inline; float: right; text-align: right; width: 100px; }
            .star {color:#395666}
            .ref{font-weight: bold; color: #009999}
            .m1{margin-left: 0px;}
            .m2{margin-left: 5px;}
            .m3{margin-left: 10px;}
            .m4{margin-left: 14px;}
            .m2 b i {color: #996300}
            .m3 b {color: darkblue}
            .p1 {color: #333b14}
            .col1{color: #660066}
            .col2{color: #006600}
            .col3{color: #857d49}
            .col4 {color: gray}
            .m3 .col4{color: darkgreen}
            .ex {color: #000099; font-family: "Trebuchet MS", "Times New Roman"}
            .com{color: darkred}
            </style>
</HEAD>
<BODY BGCOLOR="#FFFFFF" TEXT="#000000" LINK="#000099" VLINK="#330066" ALINK="#FF0000">
<DL>
'''

# Closes the <DL> list opened at the end of HTML_HEADER.
HTML_FOOTER = r'''
</DL>
</BODY>
</HTML>
'''
# ################################################################
# ################################################################
# ###################  END OF CONFIG SECTION  ####################
# ################################################################
# ################################################################


# DEFAULT ENCODINGS (detected at run-time, fall back to cp1251 on non-Unicode).
# An empty string means "not chosen yet"; see detect_encodings().
INPUT_ENCODING = OUTPUT_ENCODING = ''

# PRETTY_HTML: wrap the output in HTML_HEADER / HTML_FOOTER.
# DIRTY_CONVERSION: checked by convert_dsl() to print a data-loss warning.
PRETTY_HTML, DIRTY_CONVERSION = True, False

def Usage():
    '''Print the command-line help, headed by this script's file name.'''
    help_template = '''USAGE:
          %s [-i Encoding] [-o Encoding] [-p] [-r] <Dictionary_Unicode.DSL> [Dictionary_converted.TxT]
          Where:
          -i, --input-encoding = input file encoding
          -o, --output-encoding = output file encoding
          -p, --pretty  = add HTML header and footer (default)
          -r, --raw     = don't add HTML header and footer

          Available encodings:
          utf8        8-bit variable length encoding
          utf_16      16-bit variable length encoding (little/big endian)
          utf_16_le   utf-16 but explicitly little endian
          utf_16_be   utf-16 but explicitly big endian
          cp1251      Windows Cyrillic
          cp866       OEM Cyrillic (DOS)
          ascii       7-bit ASCII codepage
          iso-8859-1  ISO 8859-1 (Latin 1) codepage
          For a complete list of encodings see:
          http://docs.python.org/lib/standard-encodings.html

EXAMPLES:

          pyDSL2HTM oxford.dsl
          converts a Lingvo DSL Unicode file to HTML (UTF-8)

          pyDSL2HTM -o 1251 oxford.dsl
          converts a Lingvo DSL Unicode file to HTML (Windows-1251)'''
    # Only the file name of the running script is shown, not its directory.
    script_name = os.path.basename(sys.argv[0])
    print(help_template % script_name)

def parseCommandLine():
    '''Process the command line parameters in sys.argv.

    Recognised options update the module-level settings:
        -i / --input-encoding   -> INPUT_ENCODING
        -o / --output-encoding  -> OUTPUT_ENCODING
        -p / --pretty           -> PRETTY_HTML = True
        -r / --raw              -> PRETTY_HTML = False
        -h / --help             -> print the usage text (parsing continues)

    Returns the list of remaining positional arguments.
    On an unknown or malformed option the usage text is printed and the
    process exits with status 2.
    '''
    import getopt
    global INPUT_ENCODING, OUTPUT_ENCODING, PRETTY_HTML
    try:
        opts, args = getopt.getopt(sys.argv[1:], "i:o:hpr",
                                   ["input-encoding=",
                                    "output-encoding=",
                                    "help",
                                    "pretty",
                                    "raw"])
    except getopt.GetoptError:
        # Print help information and exit.  (The function is named Usage;
        # calling 'usage' here used to raise NameError instead.)
        Usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            # Deliberately does not exit: the caller decides what to do next.
            Usage()
        elif opt in ("-i", "--input-encoding"):
            INPUT_ENCODING = arg
        elif opt in ("-o", "--output-encoding"):
            OUTPUT_ENCODING = arg
        elif opt in ("-p", "--pretty"):
            PRETTY_HTML = True
        elif opt in ("-r", "--raw"):
            PRETTY_HTML = False
    return args

def calc_eta(start, now):
    '''Format the time elapsed between start and now (seconds) as 'MM:SS'.

    Returns '--:--' once the elapsed time exceeds 99 minutes.
    '''
    elapsed = now - start
    minutes = elapsed / 60
    if minutes > 99:
        return '--:--'
    seconds = elapsed % 60
    return '%02d:%02d' % (minutes, seconds)

def isFileUnicode(file_name, BOMbyte='\xFF'):
    '''Return True if the first byte of file_name equals BOMbyte.

    The default marker 0xFF is the first byte of the UTF-16 little-endian
    byte order mark.  Only the first byte is examined; an empty file never
    matches.

    file_name -- path of the file to probe
    BOMbyte   -- one-character marker to compare the first byte against
    '''
    f = open(file_name, 'rb')
    try:
        first_byte = f.read(1)
    finally:
        # Close the file even if the read fails.
        f.close()

    # The file is read as bytes.  If the marker was given as a different
    # string type, convert it so the comparison is byte-for-byte (latin-1
    # maps code points 0-255 straight onto byte values).
    if not isinstance(BOMbyte, type(first_byte)):
        BOMbyte = BOMbyte.encode('latin-1')
    return first_byte == BOMbyte

def detectLineTerminator(file_name):
    '''Guess the line terminator of file_name from its first line.

    The file is read in binary mode up to and including the first '\\n'
    byte.  Returns '\\r\\n', '\\r', '\\n' or '' depending on which of the two
    characters occur in that first line ('' when neither does, e.g. for an
    empty file).
    '''
    f = open(file_name, 'rb')
    try:
        first_line = f.readline()
    finally:
        # Close the file even if the read fails.
        f.close()

    # Decode as latin-1 (one character per byte, cannot fail) so that the
    # membership tests below compare text with text.
    first_line = first_line.decode('latin-1')

    LINE_TERMINATOR = ""
    if '\r' in first_line:
        LINE_TERMINATOR += '\r'
    if '\n' in first_line:
        LINE_TERMINATOR += '\n'
    return LINE_TERMINATOR


def str_replace(text, dic):
    """Replace every key of dic found in text with its value.

    The keys are applied one after another in sorted order, so the output
    of an earlier replacement can be changed again by a later key.
    Returns the new string.
    """
    # sorted() accepts any mapping; it does not need keys() to return a list.
    for key in sorted(dic):
        text = text.replace(key, dic[key])
    return text

def str_replace_re(text, dic):
    """Apply every regular-expression replacement in dic to text.

    dic maps patterns (compiled or plain strings, as accepted by re.sub) to
    replacement templates.  The substitutions are applied one after another
    in the iteration order of dic.  Returns the new string.
    """
    # Iterating the mapping itself works for any mapping type.
    for pattern in dic:
        text = re.sub(pattern, dic[pattern], text)
    return text

##def multiple_replace(text, dic):
##
##  """ Replace in 'text' all occurences of any key in the given
##  dictionary by its corresponding value.  Returns the new string."""
##
##  # Create a regular expression  from the dictionary keys
##  regex = re.compile("(%s)" % "|".join(map(re.escape, dic.keys())))
##  #regex = re.compile("(%s)" % "|".join( dic.keys()))
##  # For each match, look-up corresponding value in dictionary
##  return regex.sub(lambda mo: dic[mo.string[mo.start():mo.end()]], text)
##
### data = codecs.open (, 'r', 'utf_16_le').read()

def do_DSL2HTML_conv(data):
    """Convert DSL markup in data to HTML and return the result.

    The two plain-substring tables are applied first, in order, followed by
    the regular-expression table.
    """
    for literal_table in (DSL2HTM_DICT_1ST_PASS, DSL2HTM_DICT_2ND_PASS):
        data = str_replace(data, literal_table)
    return str_replace_re(data, DSL2HTM_DICT_3RD_PASS)

def write_entry(ofile, headWord, definition):
    '''Write one HTML formatted dictionary entry to the output file.

    ofile      -- file object opened for writing bytes
    headWord   -- headword text; the HEADWORD_REPL substitutions are applied
                  and backslashes are removed
    definition -- definition text in DSL markup; converted to HTML, with
                  every END_OF_LINE replaced by a newline + NEW_LINE_MARKER

    The entry is laid out as
        ENTRY_SEPARATOR headword FIELD_SEPARATOR NEW_LINE_MARKER definition
    and encoded with OUTPUT_ENCODING.  If the entry cannot be encoded, it
    is not written: DIRTY_CONVERSION is set and a message is reported
    through callback_print().
    '''
    global DIRTY_CONVERSION   # without this the flag below would be a local

    definition = do_DSL2HTML_conv(definition)

    # Skip the replacement while no terminator is known: replacing '' would
    # insert the marker between every character and None raises TypeError.
    if END_OF_LINE:
        definition = definition.replace(END_OF_LINE, "\n" + NEW_LINE_MARKER)

    headWord = str_replace(headWord, HEADWORD_REPL)

    # Remove escapes in headwords (kept explicit in case HEADWORD_REPL is
    # reconfigured without the backslash rule).
    headWord = headWord.replace('\\', '')

    #   <!--~-->  aadvark  <!--=-->  <dd>  a badger-sized African mammal
    entry = ENTRY_SEPARATOR + headWord + FIELD_SEPARATOR + NEW_LINE_MARKER + definition

    try:
        encoded = entry.encode(OUTPUT_ENCODING)
    except UnicodeEncodeError:
        # sys.exc_info() avoids version-specific 'except ... as' syntax.
        error = sys.exc_info()[1]
        # Set the flag first: callback_print() may terminate the process.
        DIRTY_CONVERSION = True
        callback_print("%s  Please use '-o utf-8'!" % error)
    else:
        ofile.write(encoded)

def callback_print(data, overwriteLine=False):
    '''Report a progress/status message to the callback and to stdout.

    data          -- message text
    overwriteLine -- if true, the message replaces the current console line
                     (it is written after a carriage return and without a
                     trailing newline)

    If a module-level callback is installed it is called as
    callback(data, overwritePreviousLine=overwriteLine).  Afterwards its
    TERMINATE attribute is checked; when that is True the conversion is
    aborted: the open files are closed via cleanExit(), the callback's
    CLEANUP_COMPLETE attribute is set to True and the process exits with
    status -2.
    '''
    if callback:
        callback(data, overwritePreviousLine=overwriteLine)
        # A callback that never set TERMINATE is treated as "keep going".
        if getattr(callback, 'TERMINATE', False) == True:
            if sys.stdout:
                sys.stdout.write('Got Terminate REQUEST!')
            callback('Conversion aborted', True, True)
            cleanExit()
            # Tell the callback owner that the files have been closed.
            callback.CLEANUP_COMPLETE = True
            sys.exit(-2)

    # sys.stdout may be unavailable (e.g. when run without a console).
    if sys.stdout:
        if overwriteLine:
            sys.stdout.write('\r' + data)
        else:
            sys.stdout.write(data)
            sys.stdout.write('\n')

def detect_encodings(input_file):
    '''Fill in INPUT_ENCODING / OUTPUT_ENCODING where they are still empty.

    The guess is based on the first byte of input_file:
        0xFF or 0x00 -> input 'utf_16',    default output 'utf-8'
        0xEF         -> input 'utf-8-sig', default output 'utf-8'
        anything else-> input 'cp1251',    default output 'cp1251'
    An encoding that has already been chosen (e.g. on the command line) is
    never overwritten.  Both resulting encodings are reported through
    callback_print().
    '''
    global INPUT_ENCODING
    global OUTPUT_ENCODING

    if not INPUT_ENCODING or not OUTPUT_ENCODING:

        if isFileUnicode(input_file) or isFileUnicode(input_file, '\x00'):
            # Unicode UCS2 little endian BOM starts with \xFF.
            # NOTE(review): a leading \x00 looks like BOM-less big-endian
            # UTF-16, which 'utf_16' may not decode correctly - confirm.
            detected_input, default_output = 'utf_16', 'utf-8'

        elif isFileUnicode(input_file, '\xEF'):
            # \xEF is the first byte of the UTF-8 byte order mark;
            # 'utf-8-sig' strips that mark instead of leaving U+FEFF at the
            # start of the first line.
            detected_input, default_output = 'utf-8-sig', 'utf-8'

        else:
            detected_input, default_output = 'cp1251', 'cp1251'

        # Keep whatever the user selected; only fill in the blanks.
        INPUT_ENCODING = INPUT_ENCODING or detected_input
        OUTPUT_ENCODING = OUTPUT_ENCODING or default_output

    callback_print('Input encoding  = %s' % INPUT_ENCODING)
    callback_print('Output encoding = %s' % OUTPUT_ENCODING)

def cleanExit():
    '''Close the remembered input and output files if they are still open.'''
    for handle in (_in_file, _out_file):
        if handle and not handle.closed:
            handle.close()



# #######################################################
# ###########   conversion routine ######################
# #######################################################
def _iter_dsl_entries(lines, line_break):
    '''Yield (headword, definition) pairs parsed from DSL source lines.

    Blank lines and lines starting with '#' are skipped.  A line that does
    not start with a space or tab is a headword; the indented lines that
    follow it are stripped and joined with line_break to form its
    definition.  Indented lines before the first headword are ignored.
    '''
    headWord = ""
    body = []
    for line in lines:
        stripped = line.strip()
        if not stripped or line[0] == "#":
            continue

        if line[0] in (" ", "\t"):          # is a DEFINITION line
            if headWord:
                body.append(stripped)
            continue

        # is a HEADWORD: it also ends the entry collected so far
        if headWord:
            yield headWord, line_break.join(body)
            body = []
        headWord = stripped

    if headWord:                            # last entry of the file
        yield headWord, line_break.join(body)


def convert_dsl(input_file=None,
                output_file=None,
                in_enc=None,
                out_enc=None,
                pretty_html=False,
                callback_func=None):
    '''Convert the DSL dictionary input_file into the HTML file output_file.

    input_file    -- path of the DSL source file
    output_file   -- path of the file to create (overwritten if it exists)
    in_enc        -- input encoding; overrides INPUT_ENCODING when given
    out_enc       -- output encoding; overrides OUTPUT_ENCODING when given
    pretty_html   -- true to wrap the output in HTML_HEADER / HTML_FOOTER;
                     a false value leaves the current PRETTY_HTML setting
    callback_func -- optional callable that receives progress messages
                     (see callback_print)

    The settings are stored in the module-level variables of the same
    purpose, which write_entry() and callback_print() read.  Both files are
    closed before the function returns, also when an error occurs.
    '''
    global INPUT_ENCODING
    global OUTPUT_ENCODING
    global callback
    global PRETTY_HTML
    global ENTRY_SEPARATOR
    global FIELD_SEPARATOR
    global END_OF_LINE
    global DIRTY_CONVERSION
    global _in_file
    global _out_file

    # Install the callback first so that the encoding report printed by
    # detect_encodings() already reaches it.
    callback = callback_func

    # Start every conversion with a clean data-loss flag.
    DIRTY_CONVERSION = False

    detect_encodings(input_file)

    INPUT_ENCODING = in_enc or INPUT_ENCODING
    OUTPUT_ENCODING = out_enc or OUTPUT_ENCODING
    PRETTY_HTML = pretty_html or PRETTY_HTML

    # Definition lines are joined with END_OF_LINE below and split on it
    # again in write_entry(), so it must never be empty.
    END_OF_LINE = detectLineTerminator(input_file) or os.linesep

    if PRETTY_HTML:
        # The separators are module globals; append the pretty-mode suffix
        # only once, however often convert_dsl() is called.
        if not ENTRY_SEPARATOR.endswith(ADDITIONAL_ENTRY_SEPARATOR_PRETTY):
            ENTRY_SEPARATOR += ADDITIONAL_ENTRY_SEPARATOR_PRETTY
        if not FIELD_SEPARATOR.endswith(ADDITIONAL_FIELD_SEPARATOR_PRETTY):
            FIELD_SEPARATOR += ADDITIONAL_FIELD_SEPARATOR_PRETTY

    entryCount = 0

    in_file = codecs.open(input_file, 'rb', encoding=INPUT_ENCODING)
    _in_file = in_file          # remembered for cleanExit()
    try:
        out_file = open(output_file, 'wb')
        _out_file = out_file    # remembered for cleanExit()
        try:
            if OUTPUT_ENCODING == 'utf-8':
                out_file.write(codecs.BOM_UTF8)

            if PRETTY_HTML:
                # Write HTML_HEADER and insert appropriate HTML charset
                out_file.write((HTML_HEADER % OUTPUT_ENCODING).encode(OUTPUT_ENCODING))

            callback_print('Output file:\t%s ("%s")\n' % (output_file, OUTPUT_ENCODING))

            startTime = time.time()

            for headWord, definition in _iter_dsl_entries(in_file, END_OF_LINE):
                write_entry(out_file, headWord, definition)
                entryCount += 1

                # Progress report every 1024 entries.
                if not entryCount % 1024:
                    callback_print("Entries: %d \tElapsed time: %s" %
                                   (entryCount,
                                    calc_eta(startTime, time.time())),
                                   overwriteLine=True)

            if PRETTY_HTML:
                out_file.write(HTML_FOOTER.encode(OUTPUT_ENCODING))
        finally:
            out_file.close()
    finally:
        in_file.close()

    if DIRTY_CONVERSION:
        callback_print('''WARNING: Data was lost during conversion! The selected output encoding "%s" does not support all the characters in the source file!''' % OUTPUT_ENCODING)

    elapsedTime = "%02d:%02d" % divmod(int(time.time() - startTime), 60)

    callback_print("Total entries processed: %d   Elapsed time: %s" % (entryCount, elapsedTime))


# Command-line entry point:
#   pyDSL2HTM [options] <input.dsl> [output file]
if __name__ == "__main__":
    args = parseCommandLine()

    # Without an input file there is nothing to do.
    if not args:
        Usage()
        sys.exit(-2)

    in_file_name = args[0]

    # Needed here already: the default output name contains the encoding.
    detect_encodings(in_file_name)

    if len(args) > 1:
        out_file_name = args[1]
    else:
        # <input base name>_<output encoding>.HTM in the current directory
        base_name = os.path.splitext(os.path.basename(in_file_name))[0]
        out_file_name = base_name + '_' + OUTPUT_ENCODING + '.HTM'

    if os.path.exists(out_file_name):
        answer = raw_input("\rFile " + out_file_name + " already exists! \nOverwrite? Y/N ")
        # Only an explicit 'N' / 'n' cancels the conversion.
        if answer.upper() == 'N':
            sys.exit(-2)
        sys.stdout.write("\n")

    print('Input file:\t%s ("%s")' % (os.path.basename(in_file_name), INPUT_ENCODING))

    convert_dsl(in_file_name,
                out_file_name,
                in_enc=INPUT_ENCODING,
                out_enc=OUTPUT_ENCODING,
                pretty_html=PRETTY_HTML)

